Filtering criteria:

# Load Gandal's Dataset
if(file.exists('./../Data/Gandal_RNASeq.RData')){
  
  load('./../Data/Gandal_RNASeq.RData')
  
  datExpr_Gandal = datExpr %>% dplyr::select(which(colnames(datExpr) %in% rownames(datMeta[datMeta$Diagnosis_=='CTL',])))
  datMeta_Gandal = datMeta %>% filter(Diagnosis_=='CTL')
  datProbes_Gandal = datProbes
  DE_info_Gandal = DE_info
  
} else print('Gandal_RNASeq.RData does not exist. Find how to create it in 04_26_Gandal_RNASeq_exploratory_analysis.RData')

# Load BrainSpan's Dataset
if(file.exists('./../Data/BrainSpan_filtered.RData')){
  
  load('./../Data/BrainSpan_filtered.RData')
  
  datExpr_BrainSpan = datExpr
  datMeta_BrainSpan = datMeta
  datProbes_BrainSpan = datProbes
  DE_info_BrainSpan = DE_info
  
} else print('BrainSpan_raw.RData does not exist. Find how to create it in 05_14_BrainSpan_exploratory_analysis.RData')

rm(datExpr, datMeta, datProbes, DE_info)

# Probe comparison
paste0('After filtering, Gandal has ', nrow(datExpr_Gandal),' probes and BrainSpan has ', 
       nrow(datExpr_BrainSpan),', of which they share ',
       sum(rownames(datExpr_Gandal) %in% rownames(datExpr_BrainSpan)))
## [1] "After filtering, Gandal has 33478 probes and BrainSpan has 38790, of which they share 28654"
all_probes = unique(c(rownames(datExpr_Gandal), rownames(datExpr_BrainSpan)))

DE_genes_df = data.frame('Gandal' = all_probes %in% rownames(datExpr_Gandal),
                         'BrainSpan' = all_probes %in% rownames(datExpr_BrainSpan))
rownames(DE_genes_df) = all_probes

plot(venneuler(DE_genes_df))

# Filter to keep only common probes
datExpr_Gandal = datExpr_Gandal[rownames(datExpr_Gandal) %in% rownames(datExpr_BrainSpan),]
datProbes_Gandal = datProbes_Gandal[rownames(datProbes_Gandal) %in% rownames(datExpr_Gandal),]

datExpr_BrainSpan = datExpr_BrainSpan[rownames(datExpr_BrainSpan) %in% rownames(datExpr_Gandal),]
datProbes_BrainSpan = datProbes_BrainSpan[rownames(datProbes_BrainSpan) %in% rownames(datExpr_BrainSpan),]

# write.csv(rownames(datExpr_Gandal), file='./../Data/Gandal_BrainSpan_probes.csv', row.names=FALSE)

rm(all_probes, DE_genes_df)

Normalise Gandal data

Normalisation method: cqn from the cqn package (following Gandal’s preprocessing pipeline)

# Conditional Quantile Normalisation (CQN)
cqn.dat = cqn(datExpr_Gandal,lengths = as.numeric(datProbes_Gandal$length),
              x = as.numeric(datProbes_Gandal$percentage_gc_content),
              lengthMethod = c('smooth'), sqn = FALSE)
cqn.dat = cqn.dat$y + cqn.dat$offset  # Get the log2(Normalized FPKM) values
datExpr_Gandal = cqn.dat

# # VST Normalisation
# counts = as.matrix(datExpr_Gandal)
# rowRanges = GRanges(datProbes_Gandal$chromosome_name,
#                   IRanges(datProbes_Gandal$start_position, width=datProbes_Gandal$length),
#                   strand=datProbes_Gandal$strand,
#                   feature_id=datProbes_Gandal$ensembl_gene_id)
# se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_Gandal)
# dds = DESeqDataSet(se, design =~1)
# dds = estimateSizeFactors(dds)
# vst_output = vst(dds)
# datExpr_Gandal = assay(vst_output)
# 
# rm(counts, rowRanges, se, dds, vst_output)

rm(cqn.dat)

Create artificial binary classifier

Assigning a label to each subject, so different samples from the same subject share class, as it would happen with real ASD samples

Gandal dataset:

fake_class = data.frame('Subject_ID' = unique(datMeta_Gandal$Subject_ID),
                        'fake' = rbinom(length(unique(datMeta_Gandal$Subject_ID)), 1, 0.5))
datMeta_Gandal = datMeta_Gandal %>% left_join(fake_class, by='Subject_ID')

table(datMeta_Gandal$fake)
## 
##  0  1 
## 18 17

BrainSpan dataset:

fake_class = data.frame('donor_id' = unique(datMeta_BrainSpan$donor_id),
                        'fake' = rbinom(length(unique(datMeta_BrainSpan$donor_id)), 1, 0.5))
datMeta_BrainSpan = datMeta_BrainSpan %>% left_join(fake_class, by='donor_id')

table(datMeta_BrainSpan$fake)
## 
##   0   1 
## 112 189


Perform DEA

Following Gandal’s preprocessing script (limma lmFit), removing robust parameter to improve running time for BrainSpan

# Gandal dataset:
mod = model.matrix(~fake, data=datMeta_Gandal)
corfit = duplicateCorrelation(datExpr_Gandal, mod, block=datMeta_Gandal$Subject_ID)
lmfit = lmFit(datExpr_Gandal, design=mod, block=datMeta_Gandal$Subject_ID, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_Gandal))
new_DE_info_Gandal = top_genes[match(rownames(datExpr_Gandal), rownames(top_genes)),]

new_DE_info_Gandal$signif_p_val = new_DE_info_Gandal$adj.P.Val<0.05
new_DE_info_Gandal$signif_lfc = abs(new_DE_info_Gandal$logFC)>log2(1.2)
new_DE_info_Gandal$signif = new_DE_info_Gandal$signif_p_val & new_DE_info_Gandal$signif_lfc

new_DE_info_Gandal = new_DE_info_Gandal %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
                     dplyr::select(ID, logFC, AveExpr, P.Value, adj.P.Val, signif_p_val, signif_lfc, signif,
                                   `gene-score`, syndromic) %>% mutate(`gene-score` = as.factor(`gene-score`))

rm(mod, corfit, lmfit, fit, top_genes)
# BrainSpan dataset:
mod = model.matrix(~fake, data=datMeta_BrainSpan)
corfit = duplicateCorrelation(log2(datExpr_BrainSpan+1), mod, block=datMeta_BrainSpan$donor_id)
lmfit = lmFit(log2(datExpr_BrainSpan+1), design=mod, block=datMeta_BrainSpan$donor_id, correlation=corfit$consensus)

fit = eBayes(lmfit, trend=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_BrainSpan))
new_DE_info_BrainSpan = top_genes[match(rownames(datExpr_BrainSpan), rownames(top_genes)),]

new_DE_info_BrainSpan$signif_p_val = new_DE_info_BrainSpan$adj.P.Val<0.05
new_DE_info_BrainSpan$signif_lfc = abs(new_DE_info_BrainSpan$logFC)>log2(1.2)
new_DE_info_BrainSpan$signif = new_DE_info_BrainSpan$signif_p_val & new_DE_info_BrainSpan$signif_lfc

new_DE_info_BrainSpan = new_DE_info_BrainSpan %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
                        dplyr::select(ID, logFC, AveExpr, P.Value, adj.P.Val, signif_p_val, signif_lfc, signif,
                                      `gene-score`, syndromic) %>% mutate(`gene-score` = as.factor(`gene-score`))


rm(mod, corfit, lmfit, fit, top_genes)


Compare logFC by SFARI score

In the Gandal dataset the higher the SFARI score, the lower the logFC, but this pattern is not as clear in the BrainSpan dataset. This could be because the standard deviation in BrainSpan doesn’t seem to be affected as much with the log2 transformation, staying relatively constant for all scores, while it got inverted in Gandal’s, which in turn is a consequence of the relation between the mean and standard deviation of the probes.

p1 = ggplotly(new_DE_info_Gandal %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() + 
              scale_color_manual(values=gg_colour_hue(9)) + theme_minimal() + theme(legend.position = 'none') +
              ggtitle('logFC in Gandal dataset (left) and in BrainSpan (right)'))

p2 = ggplotly(new_DE_info_BrainSpan %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() + 
              scale_color_manual(values=gg_colour_hue(9)) + theme_minimal() + theme(legend.position = 'none') +
              ggtitle('logFC in Gandal dataset (left) and in BrainSpan (right)'))

subplot(p1, p2, nrows=1)
rm(p1, p2)

There isn’t a strong correlation between logFC in the two datasets

plot_data = new_DE_info_Gandal %>% left_join(new_DE_info_BrainSpan, by='ID') %>% 
            mutate('gene-score'=as.factor(ifelse(is.na(`gene-score.x`), 'NA', `gene-score.x`)))

corr = cor(plot_data$logFC.x, plot_data$logFC.y)
ggplotly(plot_data %>% ggplot(aes(logFC.x, logFC.y, color=`gene-score`)) + geom_point(alpha=0.2) + 
         geom_hline(yintercept=log2(1.2), color='#cc0099') + geom_hline(yintercept=-log2(1.2), color='#cc0099') + 
         geom_vline(xintercept=log2(1.2), color='#cc0099') + geom_vline(xintercept=-log2(1.2), color='#cc0099') +
         ggtitle(paste0('logFC comparison by gene corr=', round(corr,4))) + scale_color_manual(values=gg_colour_hue(9)) + 
         xlab('Gandal') + ylab('BrainSpan') + theme_minimal())
rm(corr)
for(score in sort(unique(plot_data$`gene-score`))){
  corr = cor(plot_data$logFC.x[plot_data$`gene-score`==score], plot_data$logFC.y[plot_data$`gene-score`==score])
  print(paste0('Correlation = ', round(corr,4), ' for SFARI score ', score))
}
## [1] "Correlation = -0.4944 for SFARI score 1"
## [1] "Correlation = -0.0112 for SFARI score 2"
## [1] "Correlation = -0.131 for SFARI score 3"
## [1] "Correlation = -0.152 for SFARI score 4"
## [1] "Correlation = -0.2001 for SFARI score 5"
## [1] "Correlation = -0.1127 for SFARI score 6"
## [1] "Correlation = -0.0749 for SFARI score NA"

Concordance in significance of genes based in log Fold Change

table(plot_data$signif_lfc.x, plot_data$signif_lfc.y)/nrow(plot_data)*100
##        
##            FALSE     TRUE
##   FALSE 64.83931 12.51352
##   TRUE  20.48016  2.16701


For both datasets, almost no genes have a significant adjusted p-value (makes sense because the classifier has no biological meaning)

summary(plot_data$signif_p_val.x)
##    Mode   FALSE 
## logical   28657
summary(plot_data$signif_p_val.y)
##    Mode   FALSE    TRUE 
## logical   28656       1